Data Cleaning - Geospatial Data
# specify the directory containing the CSV files
directory = '../../data/quarterly_financials'
# create an empty list to store the dataframes
dfs = []
# loop over the CSV files in the directory
for filename in os.listdir(directory):
if filename.endswith('.csv'):
# read the CSV file into a dataframe and append it to the list
path = os.path.join(directory, filename)
df = pd.read_csv(path)
dfs.append(df)
# concatenate the dataframes into a single dataframe
quarterly_financials = pd.concat(dfs, ignore_index=True)
# sort
quarterly_financials = quarterly_financials.sort_values(by=['REPDTE', 'CERT'], ascending=[False, True])
quarterly_financials = quarterly_financials.reset_index(drop=True)# print the combined dataframe
display(quarterly_financials.shape)
display(quarterly_financials.head())(971541, 15)
| ZIP | BKCLASS | REPDTE | DEPDOM | ASSET | STNAME | EQ | NAME | CITY | ADDRESS | ENDEFYMD | CERT | ESTYMD | LIAB | ID | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2111 | SM | 20221231 | 163284000.0 | 298020000 | MASSACHUSETTS | 26579000.0 | STATE STREET BANK&TRUST CO | BOSTON | 1 LINCOLN ST | 99991231.0 | 14 | 17920101 | 271441000.0 | 14_20221231 |
| 1 | 36830 | SM | 20221231 | 952037.0 | 1023366 | ALABAMA | 65967.0 | AUBURNBANK | AUBURN | 100 N GAY ST | 99991231.0 | 35 | 19070103 | 957399.0 | 35_20221231 |
| 2 | 36732 | NM | 20221231 | 407949.0 | 444822 | ALABAMA | 32577.0 | ROBERTSON BANKING CO | DEMOPOLIS | 216 N WALNUT AVE | 99991231.0 | 39 | 18700101 | 412245.0 | 39_20221231 |
| 3 | 36867 | NM | 20221231 | 266874.0 | 265272 | ALABAMA | -8465.0 | PHENIX-GIRARD BANK | PHENIX CITY | 801 13TH ST | 99991231.0 | 41 | 19040504 | 273737.0 | 41_20221231 |
| 4 | 36401 | NM | 20221231 | 70649.0 | 76239 | ALABAMA | 5487.0 | BANK OF EVERGREEN | EVERGREEN | 146 W FRONT ST | 99991231.0 | 49 | 19320901 | 70752.0 | 49_20221231 |
display(len(quarterly_financials['BKCLASS'].unique()))
display(quarterly_financials['BKCLASS'].unique())7
array(['SM', 'NM', 'N', 'SI', 'SB', 'SL', 'OI'], dtype=object)
Zip coordinates
# bring in
zip_coordinates = pd.read_csv('../../data/coordinate_data/2022_Gaz_zcta_national.txt', sep='\t')
# remove whitespace in col names
zip_coordinates.columns = [col.strip() for col in zip_coordinates.columns]
# drop unnecesary columns
zip_coordinates.drop(['ALAND', 'ALAND_SQMI', 'AWATER', 'AWATER_SQMI'], axis=1, inplace=True)
# check
display(zip_coordinates.shape)
display(zip_coordinates.head())(33791, 3)
| GEOID | INTPTLAT | INTPTLONG | |
|---|---|---|---|
| 0 | 601 | 18.180555 | -66.749961 |
| 1 | 602 | 18.361945 | -67.175597 |
| 2 | 603 | 18.457399 | -67.124867 |
| 3 | 606 | 18.158327 | -66.932928 |
| 4 | 610 | 18.293960 | -67.127182 |
Combine
# Rename the 'GEOID' column in 'zip_coordinates' to match the 'ZIP' column in 'quarterly_financials'
zip_coordinates = zip_coordinates.rename(columns={'GEOID': 'ZIP'})
# Merge the two dataframes on the 'ZIP' column
merged_df = quarterly_financials.merge(zip_coordinates, on='ZIP', how='left')
# Create a new column 'coordinates' with the combined 'INTPTLAT' and 'INTPTLONG' columns as a tuple
merged_df['zip_coordinates'] = list(zip(merged_df['INTPTLAT'], merged_df['INTPTLONG']))
# Update the 'quarterly_financials' dataframe with the new 'coordinates' column
quarterly_financials = merged_df
# Print the updated 'quarterly_financials' dataframe
display(quarterly_financials.shape)
display(quarterly_financials.head())(971541, 18)
| ZIP | BKCLASS | REPDTE | DEPDOM | ASSET | STNAME | EQ | NAME | CITY | ADDRESS | ENDEFYMD | CERT | ESTYMD | LIAB | ID | INTPTLAT | INTPTLONG | zip_coordinates | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2111 | SM | 20221231 | 163284000.0 | 298020000 | MASSACHUSETTS | 26579000.0 | STATE STREET BANK&TRUST CO | BOSTON | 1 LINCOLN ST | 99991231.0 | 14 | 17920101 | 271441000.0 | 14_20221231 | 42.350680 | -71.060527 | (42.35068, -71.060527) |
| 1 | 36830 | SM | 20221231 | 952037.0 | 1023366 | ALABAMA | 65967.0 | AUBURNBANK | AUBURN | 100 N GAY ST | 99991231.0 | 35 | 19070103 | 957399.0 | 35_20221231 | 32.534872 | -85.493755 | (32.534872, -85.493755) |
| 2 | 36732 | NM | 20221231 | 407949.0 | 444822 | ALABAMA | 32577.0 | ROBERTSON BANKING CO | DEMOPOLIS | 216 N WALNUT AVE | 99991231.0 | 39 | 18700101 | 412245.0 | 39_20221231 | 32.417456 | -87.892213 | (32.417456, -87.892213) |
| 3 | 36867 | NM | 20221231 | 266874.0 | 265272 | ALABAMA | -8465.0 | PHENIX-GIRARD BANK | PHENIX CITY | 801 13TH ST | 99991231.0 | 41 | 19040504 | 273737.0 | 41_20221231 | 32.498054 | -85.023590 | (32.498054, -85.02359) |
| 4 | 36401 | NM | 20221231 | 70649.0 | 76239 | ALABAMA | 5487.0 | BANK OF EVERGREEN | EVERGREEN | 146 W FRONT ST | 99991231.0 | 49 | 19320901 | 70752.0 | 49_20221231 | 31.468970 | -86.950426 | (31.46897, -86.950426) |
# removed unmatched zips
quarterly_financials.dropna(subset=['INTPTLAT'], inplace=True)
# drop unnecesary columns
quarterly_financials.drop(['INTPTLAT', 'INTPTLONG', 'ENDEFYMD', 'CERT', 'ESTYMD', 'LIAB', 'ID', 'ADDRESS', 'EQ', 'ZIP'], axis=1, inplace=True)
# convert numerical values to millions (source is in thousands)
quarterly_financials['DEPDOM'] = quarterly_financials['DEPDOM'].divide(1000)
quarterly_financials['ASSET'] = quarterly_financials['ASSET'].divide(1000)
# rename
quarterly_financials.rename(columns={
'BKCLASS': 'bank_class',
'REPDTE': 'report_date',
'DEPDOM': 'deposits_mill',
'ASSET': 'assets_mill',
'STNAME': 'state',
'NAME': 'name',
'CITY': 'city'
}, inplace=True)
display(quarterly_financials.head())| bank_class | report_date | deposits_mill | assets_mill | state | name | city | zip_coordinates | |
|---|---|---|---|---|---|---|---|---|
| 0 | SM | 20221231 | 163284.000 | 298020.000 | MASSACHUSETTS | STATE STREET BANK&TRUST CO | BOSTON | (42.35068, -71.060527) |
| 1 | SM | 20221231 | 952.037 | 1023.366 | ALABAMA | AUBURNBANK | AUBURN | (32.534872, -85.493755) |
| 2 | NM | 20221231 | 407.949 | 444.822 | ALABAMA | ROBERTSON BANKING CO | DEMOPOLIS | (32.417456, -87.892213) |
| 3 | NM | 20221231 | 266.874 | 265.272 | ALABAMA | PHENIX-GIRARD BANK | PHENIX CITY | (32.498054, -85.02359) |
| 4 | NM | 20221231 | 70.649 | 76.239 | ALABAMA | BANK OF EVERGREEN | EVERGREEN | (31.46897, -86.950426) |
# change to title case
quarterly_financials[['state', 'name', 'city']] = quarterly_financials[['state', 'name', 'city']].apply(lambda x: x.str.title())
# format date
quarterly_financials['report_date'] = pd.to_datetime(quarterly_financials['report_date'], format='%Y%m%d')
# set to float
quarterly_financials['deposits_mill'] = quarterly_financials['deposits_mill'].astype(float)
quarterly_financials['assets_mill'] = quarterly_financials['assets_mill'].astype(float)
display(quarterly_financials.head())| bank_class | report_date | deposits_mill | assets_mill | state | name | city | zip_coordinates | |
|---|---|---|---|---|---|---|---|---|
| 0 | SM | 2022-12-31 | 163284.000 | 298020.000 | Massachusetts | State Street Bank&Trust Co | Boston | (42.35068, -71.060527) |
| 1 | SM | 2022-12-31 | 952.037 | 1023.366 | Alabama | Auburnbank | Auburn | (32.534872, -85.493755) |
| 2 | NM | 2022-12-31 | 407.949 | 444.822 | Alabama | Robertson Banking Co | Demopolis | (32.417456, -87.892213) |
| 3 | NM | 2022-12-31 | 266.874 | 265.272 | Alabama | Phenix-Girard Bank | Phenix City | (32.498054, -85.02359) |
| 4 | NM | 2022-12-31 | 70.649 | 76.239 | Alabama | Bank Of Evergreen | Evergreen | (31.46897, -86.950426) |
quarterly_financials['bank_class'] = quarterly_financials['bank_class'].replace({
'N': 'Commercial bank, national charter, Fed member',
'NM': 'Commercial bank, state charter, Fed non-member',
'OI': 'Insured U.S. branch of a foreign chartered institution',
'SB': 'Federal savings banks',
'SI': 'State chartered stock savings banks',
'SL': 'State chartered stock savings and loan association',
'SM': 'Commercial bank, state charter, Fed member',
'NC': 'Noninsured non-deposit commercial bank',
'NS': 'Noninsured stock savings bank',
'CU': 'State or federally chartered credit union',
})
display(quarterly_financials.shape)
display(quarterly_financials.head())
(937293, 8)
| bank_class | report_date | deposits_mill | assets_mill | state | name | city | zip_coordinates | |
|---|---|---|---|---|---|---|---|---|
| 0 | Commercial bank, state charter, Fed member | 2022-12-31 | 163284.000 | 298020.000 | Massachusetts | State Street Bank&Trust Co | Boston | (42.35068, -71.060527) |
| 1 | Commercial bank, state charter, Fed member | 2022-12-31 | 952.037 | 1023.366 | Alabama | Auburnbank | Auburn | (32.534872, -85.493755) |
| 2 | Commercial bank, state charter, Fed non-member | 2022-12-31 | 407.949 | 444.822 | Alabama | Robertson Banking Co | Demopolis | (32.417456, -87.892213) |
| 3 | Commercial bank, state charter, Fed non-member | 2022-12-31 | 266.874 | 265.272 | Alabama | Phenix-Girard Bank | Phenix City | (32.498054, -85.02359) |
| 4 | Commercial bank, state charter, Fed non-member | 2022-12-31 | 70.649 | 76.239 | Alabama | Bank Of Evergreen | Evergreen | (31.46897, -86.950426) |
Growth of Assets with Slider
from ipywidgets.embed import embed_minimal_html
import branca
from IPython.display import HTML
gdf = gpd.GeoDataFrame(quarterly_financials, geometry=gpd.points_from_xy(quarterly_financials.zip_coordinates.apply(lambda p: p[1]), quarterly_financials.zip_coordinates.apply(lambda p: p[0])))
def plot_bank_assets_by_date(date):
# Filter the GeoDataFrame by the selected date
filtered_gdf = gdf[gdf['report_date'] == date]
# Group the filtered GeoDataFrame by state and sum the assets
state_assets = filtered_gdf.groupby('state')['assets_mill'].sum().reset_index()
# Create the base folium map
m = folium.Map(location=[37.8, -96], zoom_start=4)
# Define a function to scale the assets to a suitable size for the map
def scale_bubble_size(assets):
return assets / 50000
# Plot bubbles for each state with a size proportional to the total assets
for index, row in state_assets.iterrows():
state_data = filtered_gdf[filtered_gdf['state'] == row['state']]
state_centroid = state_data.unary_union.centroid
folium.CircleMarker(
location=[state_centroid.y, state_centroid.x],
radius=scale_bubble_size(row['assets_mill']),
color='blue',
fill=True,
fill_color='blue',
fill_opacity=0.5,
popup=f"State: {row['state']}<br>Total Assets: {row['assets_mill']:.0f}<br>Date: {date}"
).add_to(m)
# Display the map
return m
# Get the unique report dates sorted
unique_dates = sorted(quarterly_financials['report_date'].unique())
# Create the index slider
index_slider = widgets.IntSlider(
min=0,
max=len(unique_dates) - 1,
description='Date Index:',
continuous_update=False,
)
# Create the play button
play_button = widgets.Play(
interval=100, # Time in milliseconds between updates
value=0, # Slider starting value
min=0,
max=len(unique_dates) - 1,
step=1,
description="Press play",
disabled=False
)
# Link the play button to the index slider
widgets.jslink((play_button, 'value'), (index_slider, 'value'))
# Create a horizontal box to display the slider and play button together
slider_with_play = widgets.HBox([index_slider, play_button])
def display_map(index):
date = unique_dates[index]
m = plot_bank_assets_by_date(date)
display(m)
# Use widgets.interactive() and assign the output to a variable
interactive_map = widgets.interactive(display_map, index=index_slider)
# Clear the output of interactive_map (to prevent double display of the map)
interactive_map.update()
# Display the slider with the play button and the interactive map
display(slider_with_play)
display(interactive_map.children[-1])
import folium
import pandas as pd
import geopandas as gpd
from folium.plugins import HeatMap
from ipywidgets import widgets
from IPython.display import display, IFrame
gdf = gpd.GeoDataFrame(quarterly_financials, geometry=gpd.points_from_xy(quarterly_financials.zip_coordinates.apply(lambda p: p[1]), quarterly_financials.zip_coordinates.apply(lambda p: p[0])))
def plot_bank_assets_by_date(date):
# Filter the GeoDataFrame by the selected date
filtered_gdf = gdf[gdf['report_date'] == date]
# Group the filtered GeoDataFrame by state and sum the assets
state_assets = filtered_gdf.groupby('state')['assets_mill'].sum().reset_index()
# Create the base folium map
m = folium.Map(location=[37.8, -96], zoom_start=4)
# Define a function to scale the assets to a suitable size for the map
def scale_bubble_size(assets):
return assets / 50000
# Plot bubbles for each state with a size proportional to the total assets
for index, row in state_assets.iterrows():
state_data = filtered_gdf[filtered_gdf['state'] == row['state']]
state_centroid = state_data.unary_union.centroid
folium.CircleMarker(
location=[state_centroid.y, state_centroid.x],
radius=scale_bubble_size(row['assets_mill']),
color='blue',
fill=True,
fill_color='blue',
fill_opacity=0.5,
popup=f"State: {row['state']}<br>Total Assets: {row['assets_mill']:.0f}<br>Date: {date}"
).add_to(m)
return m
# Get the unique report dates sorted
unique_dates = sorted(quarterly_financials['report_date'].unique())
# Create the index slider
index_slider = widgets.IntSlider(
min=0,
max=len(unique_dates) - 1,
description='Date Index:',
continuous_update=False,
)
# Create the play button
play_button = widgets.Play(
interval=100, # Time in milliseconds between updates
value=0, # Slider starting value
min=0,
max=len(unique_dates) - 1,
step=1,
description="Press play",
disabled=False
)
# Link the play button to the index slider
widgets.jslink((play_button, 'value'), (index_slider, 'value'))
# Create a horizontal box to display the slider and play button together
slider_with_play = widgets.HBox([index_slider, play_button])
def display_map(index):
date = unique_dates[index]
m = plot_bank_assets_by_date(date)
display(IFrame(m._repr_html_(), width=700, height=600))
# Use widgets.interactive() and assign the output to a variable
interactive_map = widgets.interactive(display_map, index=index_slider)
# Clear the output of interactive_map (to prevent double display of the map)
interactive_map.update()
# Display the slider with the play button and the interactive map
display(slider_with_play)
display(interactive_map.children[-1])Individual Points
Heatmap all dates
Latest date only
import folium
from folium.plugins import HeatMap
import pandas as pd
from IPython.display import IFrame
# Find the latest date
latest_date = quarterly_financials['report_date'].max()
# Filter the DataFrame by the latest date
latest_quarterly_financials = quarterly_financials[quarterly_financials['report_date'] == latest_date]
# Create a base map
map = folium.Map(location=[37.8, -96], zoom_start=4)
# Prepare the data for the HeatMap
heatmap_data = []
for index, row in latest_quarterly_financials.iterrows():
coordinates = row['zip_coordinates']
assets_mill = row['assets_mill']
heatmap_data.append((*coordinates, assets_mill))
# Add the HeatMap to the map
heatmap = HeatMap(heatmap_data, radius=13, max_zoom=13, gradient={0.2: 'blue', 0.4: 'lime', 0.6: 'orange', 1: 'red'})
map.add_child(heatmap)
# For rendering on Quarto
map.save('heatmap_latest_date.html')
IFrame(src='heatmap_latest_date.html', width=700, height=600)
# For rendering in Jupyter Notebook
display(map)Make this Notebook Trusted to load map: File -> Trust Notebook
import folium
from folium.plugins import HeatMap
import pandas as pd
from IPython.display import IFrame
# Find the latest date
latest_date = quarterly_financials['report_date'].max()
# Filter the DataFrame by the latest date
latest_quarterly_financials = quarterly_financials[quarterly_financials['report_date'] == latest_date]
# Create a base map
map = folium.Map(location=[37.8, -96], zoom_start=4)
# Prepare the data for the HeatMap
heatmap_data = []
for index, row in latest_quarterly_financials.iterrows():
coordinates = row['zip_coordinates']
assets_mill = row['assets_mill']
heatmap_data.append((*coordinates, assets_mill))
# Add the HeatMap to the map
heatmap = HeatMap(heatmap_data, radius=10, max_zoom=13, gradient={0.0: '#ffffb2', 0.25: '#fecc5c', 0.5: '#fd8d3c', 0.75: '#f03b20', 1.0: '#bd0026'}) # YlOrBr color scheme
map.add_child(heatmap)
# Function to scale the assets to a suitable size for the map
def scale_bubble_size(assets):
return assets / 50000
# Add CircleMarkers with tooltips to the map
for index, row in latest_quarterly_financials.iterrows():
coordinates = row['zip_coordinates']
assets_mill = row['assets_mill']
bank_name = row['name']
tooltip_text = f"Bank: {bank_name}<br>Coordinates: {coordinates}<br>Assets (millions): {assets_mill}"
folium.CircleMarker(
location=coordinates,
radius=scale_bubble_size(assets_mill),
color='blue',
fill=True,
fill_color='blue',
fill_opacity=0.5,
tooltip=tooltip_text,
).add_to(map)
# For rendering on Quarto
map.save('heatmap.html')
IFrame(src='heatmap.html', width=700, height=600)
# For rendering in Jupyter Notebook
display(map)Make this Notebook Trusted to load map: File -> Trust Notebook